[LV]: Improve accuracy of calculating remaining iterations of MainLoopVF #156723
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: Hassnaa Hamdi (hassnaaHamdi)

Changes: Account for vscale in vscale-based trip counts when calculating the remaining iterations.

Patch is 63.28 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/156723.diff

2 Files Affected:
- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
- llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3fbeef1211954..ebdd3a69cf0e1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4411,8 +4411,21 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
const SCEV *TC =
vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE);
assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
+
+ // TODO: Maybe this could be removed when SCEV can evaluate expressions with
+ // 'vscale'.
+ // If TC is multiple of vscale, try to get estimated value:
+ if (match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale()))) {
+ std::optional<ElementCount> BestKnownTC =
+ getSmallBestKnownTC(PSE, OrigLoop);
+ if (BestKnownTC) {
+ unsigned EstimatedRuntimeTC =
+ estimateElementCount(*BestKnownTC, CM.getVScaleForTuning());
+ TC = SE.getConstant(TCType, EstimatedRuntimeTC);
+ }
+ }
RemainingIterations =
- SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
+ SE.getURemExpr(TC, SE.getElementCount(TCType, EstimatedRuntimeVF * IC));
// No iterations left to process in the epilogue.
if (RemainingIterations->isZero())
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
index 6b0da1bb2ed82..d1f13a344ecbf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -10,8 +10,7 @@
target triple = "aarch64-linux-gnu"
; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_16'
-; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
-; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:vscale x 8, Epilogue Loop UF:1
+; DEBUG: Executing best plan with VF=vscale x 16, UF=2
; DEBUG-FORCED: LV: Checking a loop in 'main_vf_vscale_x_16'
; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced.
@@ -20,61 +19,33 @@ target triple = "aarch64-linux-gnu"
define void @main_vf_vscale_x_16(ptr %A) #0 {
; CHECK-LABEL: @main_vf_vscale_x_16(
-; CHECK-NEXT: iter.check:
+; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 5
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 5
-; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 1024, [[TMP3]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 32
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP5]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 4
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 [[TMP18]]
-; CHECK-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP14]], align 1
-; CHECK-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP19]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP6]]
+; CHECK-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP4]], align 1
+; CHECK-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP7]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
-; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP22:%.*]] = shl nuw i64 [[TMP21]], 3
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP22]]
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 8
-; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 1024, [[TMP24]]
-; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 1024, [[N_MOD_VF2]]
-; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX5]]
-; CHECK-NEXT: store <vscale x 8 x i8> splat (i8 1), ptr [[TMP28]], align 1
-; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], [[TMP24]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 1024, [[N_VEC3]]
-; CHECK-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
;
@@ -94,32 +65,32 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-VF8: vector.body:
; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-VF8-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4
-; CHECK-VF8-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[TMP16]]
-; CHECK-VF8-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP12]], align 1
-; CHECK-VF8-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP17]], align 1
+; CHECK-VF8-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 4
+; CHECK-VF8-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP6]]
+; CHECK-VF8-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP4]], align 1
+; CHECK-VF8-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP7]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-VF8-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF8-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-VF8: middle.block:
; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK-VF8: vec.epilog.iter.check:
; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
-; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK-VF8: vec.epilog.ph:
; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK-VF8: vec.epilog.vector.body:
; CHECK-VF8-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-VF8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX1]]
-; CHECK-VF8-NEXT: store <8 x i8> splat (i8 1), ptr [[TMP20]], align 1
+; CHECK-VF8-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX1]]
+; CHECK-VF8-NEXT: store <8 x i8> splat (i8 1), ptr [[TMP9]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8
-; CHECK-VF8-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024
-; CHECK-VF8-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-VF8-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024
+; CHECK-VF8-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-VF8: vec.epilog.middle.block:
; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK-VF8: vec.epilog.scalar.ph:
@@ -147,22 +118,22 @@ define void @main_vf_vscale_x_2_no_epi_iteration(ptr %A) #0 vscale_range(8, 8) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i64 [[TMP8]]
-; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP6]], align 1
-; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP9]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 [[TMP4]]
+; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP2]], align 1
+; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP5]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -187,32 +158,32 @@ define void @main_vf_vscale_x_2_no_epi_iteration(ptr %A) #0 vscale_range(8, 8) {
; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-VF8: vector.body:
; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF8-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-VF8-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 1
-; CHECK-VF8-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i64 [[TMP8]]
-; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP6]], align 1
-; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP9]], align 1
+; CHECK-VF8-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 1
+; CHECK-VF8-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i64 [[TMP6]]
+; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP4]], align 1
+; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP7]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-VF8-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF8-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-VF8: middle.block:
; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK-VF8: vec.epilog.iter.check:
; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
-; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK-VF8: vec.epilog.ph:
; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK-VF8: vec.epilog.vector.body:
; CHECK-VF8-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX1]]
-; CHECK-VF8-NEXT: store <8 x i64> splat (i64 1), ptr [[TMP11]], align 1
+; CHECK-VF8-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX1]]
+; CHECK-VF8-NEXT: store <8 x i64> splat (i64 1), ptr [[TMP9]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8
-; CHECK-VF8-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024
-; CHECK-VF8-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-VF8-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024
+; CHECK-VF8-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-VF8: vec.epilog.middle.block:
; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK-VF8: vec.epilog.scalar.ph:
@@ -267,51 +238,51 @@ define void @main_vf_vscale_x_2(ptr %A, i64 %n) #0 vscale_range(8, 8) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 1
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 [[TMP16]]
-; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP12]], align 1
-; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i64 [[TMP6]]
+; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP4]], align 1
+; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]]
; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[SCALAR_PH]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 8
; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[IN...
[truncated]
Thanks for this @hassnaaHamdi!
if (match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale()))) {
  std::optional<ElementCount> BestKnownTC =
      getSmallBestKnownTC(PSE, OrigLoop);
  if (BestKnownTC) {
nit: I think you can just do:
if (std::optional<ElementCount> BestKnownTC =
        getSmallBestKnownTC(PSE, OrigLoop)) {
  ...
Could you regenerate the CHECK lines for this test in a separate pre-commit patch please? It's not your fault, but it's quite hard to see which test changes are actually related to the code changes.
✅ With the latest revision this PR passed the C/C++ code formatter.
There is a failure because this file …
if (match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale()))) {
  if (std::optional<ElementCount> BestKnownTC =
For the case when both TC and VF are multiples of vscale, this is effectively doing the opposite of the PR's title, in that we're going from potentially knowing the exact value for RemainingIterations to having a guess based on the tuning option?
I've uploaded a WIP patch (#157836, which is loosely blocked on #141798) where I'm improving SCEV's handling for vscale to improve control flow for vscale-based loops.
Yeah, this is not the most optimal solution; that's why there is a TODO depending on SCEV. For the mentioned case the current logic doesn't know the exact value for RemainingIterations, while the proposed solution decides it approximately.
I guess I'm suggesting we should restrict the use of estimateElementCount to only those cases where the values being compared are in different domains (i.e. one is fixed and the other scalable). That is the scenario where vscale becomes problematic, and providing an estimate is our only option and typically better than doing nothing.
Sure, there are scalable TC and VF cases where we don't do a great job, but often that's due to missing optimisation and/or analysis.
done.
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX6]]
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 2 x float>, ptr [[TMP24]], align 4
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX6]]
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 2 x float>, ptr [[TMP25]], align 4
; CHECK-NEXT: [[TMP26:%.*]] = fmul <vscale x 2 x float> [[WIDE_LOAD7]], [[WIDE_LOAD8]]
; CHECK-NEXT: store <vscale x 2 x float> [[TMP26]], ptr [[TMP25]], align 4
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], [[TMP23]]
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC5]]
; CHECK-NEXT: br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
This looks worse? Perhaps an artefact of what I mentioned above? The loop has a trip count of 1033*vscale and the main VF is 4*vscale, so after the vector loop there will be (1033 urem 4)*vscale ==> vscale iterations remaining, yet the epilogue loop now has a VF of 2*vscale and thus will never execute?
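Spelling out that arithmetic, in SCEV terms (a worked check; this assumes the main vector loop consumes 4*vscale elements per step, as stated above):

RemainingIterations = TC urem Step
                    = (1033 * vscale) urem (4 * vscale)
                    = (1033 urem 4) * vscale
                    = vscale

Since vscale is always strictly less than 2*vscale, the epilogue's minimum-iterations check could indeed never pass.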
That is an issue related to deciding the correct VF for the epilogue; it's fixed here: #156724
// different numerical space.
// TODO: This is a workaround until SCEV can evaluate vscale-based
// expressions.
if (match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale())) &&
I think it's probably worth caching this result in a variable since it's used in two places.
// TODO: This is a workaround until SCEV can evaluate vscale-based
// expressions.
if (match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale())) &&
    MainLoopVF.isScalable()) {
It might be a bit clearer if you handle all cases where the numerical space is the same here, i.e.

bool ScalableTC = match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale()));
if (ScalableTC == MainLoopVF.isScalable())
  RemainingIterations =
      SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
else {
  if (ScalableTC) {
    ...
RemainingIterations =
    SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
} else {
  if (match(TC, m_scev_Mul(m_SCEV(), m_SCEVVScale()))) {
I'm not sure if this is right, because what about when the trip count is tc = mul i64 %n, %vscale, i.e. unknown? Don't you just want to create a SCEV mul expression based on CM.getVScaleForTuning, i.e. if you did something like:

const SCEV *KnownMinTC;
bool ScalableTC = match(TC, m_scev_Mul(m_SCEV(KnownMinTC), m_SCEVVScale()));
...
if (ScalableTC) {
  TC = SE.getMulExpr(KnownMinTC, SE.getConstant(CM.getVScaleForTuning()));
}
I think this patch is not needed now, as #157836 has landed. SCEV can now evaluate vscale-based expressions.
Sorry, I got confused about the SCEV patch. It only handles the case when both TC and VF are vscale-based.
  // No iterations left to process in the epilogue.
- if (RemainingIterations->isZero())
+ if (!RemainingIterations || RemainingIterations->isZero())
Is this change needed? I think RemainingIterations should always be non-null because it's always set to the value returned by getURemExpr. The worst thing that can happen is that it returns a pointer to a SCEVUnknown object, but isZero will just return false.
} else {
  if (ScalableTC)
    RemainingIterations = SE.getURemExpr(
        KnownMinTC, SE.getElementCount(TCType, MainLoopVF * IC));
I think you need to multiply KnownMinTC by CM.getVScaleForTuning() to get a more accurate estimate.
RemainingIterations =
    SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
} else {
  if (ScalableTC)
You can fold this if into the else above, i.e.
if (ScalableTC == MainLoopVF.isScalable())
  RemainingIterations =
      SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
else if (ScalableTC)
  RemainingIterations = SE.getURemExpr(
      KnownMinTC, SE.getElementCount(TCType, MainLoopVF * IC));
else
  RemainingIterations = SE.getURemExpr(
      TC, SE.getElementCount(TCType, EstimatedRuntimeVF * IC));
; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_16'
; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
; DEBUG: Executing best plan with VF=vscale x 16, UF=2
Maybe worth adding a DEBUG-NOT: Create Skeleton for epilogue vectorized loop here as well, because at the moment you're only testing what VF we chose for the main loop.
      SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
else if (ScalableTC) {
  const SCEV *EstimatedTC = SE.getMulExpr(
      KnownMinTC, SE.getConstant(TCType, CM.getVScaleForTuning().value()));
I think you need CM.getVScaleForTuning().value_or(1) here because the value is only available if the vscale_range attribute is present on the function.
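For example, the quoted line would then become (same variables as in the quoted patch; just a sketch of the point above):

const SCEV *EstimatedTC = SE.getMulExpr(
    KnownMinTC, SE.getConstant(TCType, CM.getVScaleForTuning().value_or(1)));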
bool ScalableTC = match(TC, m_scev_Mul(m_SCEV(KnownMinTC), m_SCEVVScale())) ||
                  match(TC, m_scev_Mul(m_SCEVVScale(), m_SCEV(KnownMinTC)));
Suggested change:
- bool ScalableTC = match(TC, m_scev_Mul(m_SCEV(KnownMinTC), m_SCEVVScale())) ||
-                   match(TC, m_scev_Mul(m_SCEVVScale(), m_SCEV(KnownMinTC)));
+ bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale()));
Transform TC and VF to the same numerical space when they are in different spaces.
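For reference, here is roughly where the logic lands once the suggestions above are folded together (a sketch assembled from the review comments, not the final committed code; names such as KnownMinTC, ScalableTC, and EstimatedRuntimeVF come from the patch and the threads above):

const SCEV *KnownMinTC;
// Commutative match for TC == KnownMinTC * vscale, per the suggestion above.
bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale()));
if (ScalableTC == MainLoopVF.isScalable()) {
  // TC and VF are in the same numerical space, so the urem is exact.
  RemainingIterations =
      SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
} else if (ScalableTC) {
  // Scalable TC, fixed VF: estimate the runtime TC using the tuning vscale;
  // value_or(1) because vscale_range may be absent from the function.
  const SCEV *EstimatedTC = SE.getMulExpr(
      KnownMinTC, SE.getConstant(TCType, CM.getVScaleForTuning().value_or(1)));
  RemainingIterations =
      SE.getURemExpr(EstimatedTC, SE.getElementCount(TCType, MainLoopVF * IC));
} else {
  // Fixed TC, scalable VF: estimate the runtime VF instead.
  RemainingIterations =
      SE.getURemExpr(TC, SE.getElementCount(TCType, EstimatedRuntimeVF * IC));
}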